print(studies)
## [1] "BIDMC-FMT"
## [2] "CS-PRISM"
## [3] "Herfarth_CCFA_Microbiome_3B_combined"
## [4] "HMP2"
## [5] "Jansson_Lamendella_Crohns"
## [6] "LSS-PRISM"
## [7] "MucosalIBD"
## [8] "Pouchitis"
## [9] "PROTECT"
## [10] "RISK"
l_otus <- l_species <- l_genera <- list()
for(study in studies) {
load(paste0("data/phyloseq/",
study,
"/otus.RData"))
load(paste0("data/phyloseq/",
study,
"/species.RData"))
load(paste0("data/phyloseq/",
study,
"/genera.RData"))
l_otus[[study]] <- phylo
l_species[[study]] <- phylo_species
l_genera[[study]] <- phylo_genera
}
phylo_otus <- combine_phyloseq(l_otus)
phylo_species <- combine_phyloseq(l_species)
phylo_genera <- combine_phyloseq(l_genera)
tb_long <- l_otus %>%
purrr::map_dfr(
function(phylo_otus) {
tb_ra_otus <- phylo_otus %>%
transform_sample_counts(tss) %>%
phyloseq_to_tb()
tb_otu_taxonamy <- phylo_otus %>%
tax_table2() %>%
tibble::as_tibble(rownames = "feature") %>%
dplyr::mutate(classification = dplyr::case_when(
Rank7 != "s__" ~ "fully classified",
Rank1 == "k__" ~ "unclassified at kingdom",
Rank2 == "p__" ~ "unclassified at phylumn",
Rank3 == "c__" ~ "unclassified at class",
Rank4 == "o__" ~ "unclassified at order",
Rank5 == "f__" ~ "unclassified at family",
Rank6 == "g__" ~ "unclassified at genus",
Rank7 == "s__" ~ "unclassified at species"
) %>%
factor(levels= c("fully classified",
"unclassified at species",
"unclassified at genus",
"unclassified at family",
"unclassified at order",
"unclassified at class",
"unclassified at phylumn",
"unclassified at kingdom")))
tb_ra_otus %>%
dplyr::left_join(tb_otu_taxonamy,
by = c("feature", "Rank1", "Rank2", "Rank3",
"Rank4", "Rank5", "Rank6", "Rank7"))
}
)
p <- tb_long %>%
dplyr::group_by(dataset_name, feature, classification) %>% dplyr::filter(!duplicated(feature)) %>% dplyr::summarise(n = n()) %>%
ggplot(aes(x = dataset_name, y = n, fill = classification)) +
geom_bar(stat = "identity") +
rotate_xaxis(30) +
ggtitle("OTUs are mostly classified at order, family, genus, and species level")
print(p) %>%
ggsave(paste0(dir_output, "fig_OTUClassification.pdf"),
.,
width = 8,
height = 5)
p <- tb_long %>%
dplyr::group_by(dataset_name, rownames, classification) %>% dplyr::summarise(abundance = sum(abundance)) %>% dplyr::ungroup() %>%
dplyr::group_by(dataset_name, rownames) %>% dplyr::mutate(abundance_fullyClassified =
sum(abundance[classification %in% c("fully classified",
"unclassified at species")])) %>% dplyr::ungroup() %>%
dplyr::arrange(dataset_name, abundance_fullyClassified) %>% dplyr::mutate(rownames = factor(rownames, levels = unique(rownames))) %>%
ggplot(aes(x = rownames, y = abundance, fill = classification)) +
geom_bar(stat = "identity") +
facet_grid(.~dataset_name, scales = "free_x", space = "free_x") +
no_label_xaxis() +
ggtitle("OTUs are most abundantly classified at family, genus, and species level.")
print(p) %>%
ggsave(paste0(dir_output, "fig_speciesClassificationAbd.pdf"),
.,
width = 20,
height = 6)
tb_long_speciesUnclassified <- tb_long %>%
dplyr::filter(classification == "unclassified at species") %>%
dplyr::group_by(dataset_name, rownames, Rank6, Rank7) %>% dplyr::summarise(abundance = sum(abundance)) %>% dplyr::ungroup() %>%
dplyr::group_by(Rank6, Rank7) %>% dplyr::mutate(mean_abundance = mean(abundance)) %>% dplyr::ungroup() %>%
dplyr::mutate(taxonamy = ifelse(mean_abundance >= sort(unique(mean_abundance), decreasing = TRUE)[10],
paste(Rank6, Rank7, sep = "|"), "others")) %>%
dplyr::arrange(dplyr::desc(mean_abundance)) %>% dplyr::mutate(taxonamy = factor(taxonamy, levels = unique(taxonamy))) %>%
dplyr::group_by(dataset_name, rownames) %>% dplyr::mutate(all_abundance = sum(abundance)) %>% dplyr::ungroup() %>%
dplyr::arrange(dataset_name, all_abundance) %>% dplyr::mutate(rownames = factor(rownames, levels = unique(rownames)))
p1 <- tb_long_speciesUnclassified %>%
ggplot(aes(x = rownames, y = abundance, fill = taxonamy)) +
geom_bar(stat = "identity") +
facet_grid(.~dataset_name, scales = "free_x", space = "free_x") +
no_label_xaxis() +
ggtitle("Abundance of OTUs unclassified at species level")
p2 <- tb_long %>%
dplyr::filter(classification == "unclassified at species") %>%
dplyr::group_by(dataset_name) %>%
dplyr::filter(!duplicated(feature)) %>%
dplyr::ungroup() %>%
dplyr::mutate(taxonamy = paste(Rank6, Rank7, sep = "|")) %>%
dplyr::filter(taxonamy %in% levels(tb_long_speciesUnclassified$taxonamy)) %>%
dplyr::mutate(taxonamy = factor(taxonamy, levels = levels(tb_long_speciesUnclassified$taxonamy))) %>%
dplyr::group_by(dataset_name, taxonamy) %>%
dplyr::summarise(n_otus = n()) %>%
ggplot(aes(x = dataset_name, y = n_otus, fill = taxonamy)) +
geom_bar(stat = "identity") +
rotate_xaxis(30)
cowplot::plot_grid(p1, p2, nrow = 1, rel_widths = c(0.8, 0.2)) %>%
ggsave(paste0(dir_output, "fig_speciesUnclassified.pdf"),
.,
width = 25,
height = 6)
tb_long_generaUnclassified <- tb_long %>%
dplyr::filter(classification == "unclassified at genus") %>%
dplyr::group_by(dataset_name, rownames, Rank5, Rank6) %>% dplyr::summarise(abundance = sum(abundance)) %>% dplyr::ungroup() %>%
dplyr::group_by(Rank5, Rank6) %>% dplyr::mutate(mean_abundance = mean(abundance)) %>% dplyr::ungroup() %>%
dplyr::mutate(taxonamy = ifelse(mean_abundance >= sort(unique(mean_abundance), decreasing = TRUE)[10],
paste(Rank5, Rank6, sep = "|"), "others")) %>%
dplyr::arrange(dplyr::desc(mean_abundance)) %>% dplyr::mutate(taxonamy = factor(taxonamy, levels = unique(taxonamy))) %>%
dplyr::group_by(dataset_name, rownames) %>% dplyr::mutate(all_abundance = sum(abundance)) %>% dplyr::ungroup() %>%
dplyr::arrange(dataset_name, all_abundance) %>% dplyr::mutate(rownames = factor(rownames, levels = unique(rownames)))
p1 <- tb_long_generaUnclassified %>%
ggplot(aes(x = rownames, y = abundance, fill = taxonamy)) +
geom_bar(stat = "identity") +
facet_grid(.~dataset_name, scales = "free_x", space = "free_x") +
no_label_xaxis() +
ggtitle("Abundance of OTUs unclassified at genera level")
p2 <- tb_long %>%
dplyr::filter(classification == "unclassified at genus") %>%
dplyr::group_by(dataset_name) %>%
dplyr::filter(!duplicated(feature)) %>%
dplyr::ungroup() %>%
dplyr::mutate(taxonamy = paste(Rank5, Rank6, sep = "|")) %>%
dplyr::filter(taxonamy %in% levels(tb_long_generaUnclassified$taxonamy)) %>%
dplyr::mutate(taxonamy = factor(taxonamy, levels = levels(tb_long_generaUnclassified$taxonamy))) %>%
dplyr::group_by(dataset_name, taxonamy) %>%
dplyr::summarise(n_otus = n()) %>%
ggplot(aes(x = dataset_name, y = n_otus, fill = taxonamy)) +
geom_bar(stat = "identity") +
rotate_xaxis(30)
p3 <- tb_long %>%
dplyr::filter(classification == "unclassified at genus",
Rank5 == "f__Enterobacteriaceae",
Rank6 == "g__") %>%
dplyr::group_by(dataset_name) %>%
dplyr::filter(!duplicated(feature)) %>%
dplyr::ungroup() %>%
dplyr::mutate(taxonamy = paste(Rank5, Rank6, sep = "|")) %>%
ggplot(aes(x = dataset_name)) +
geom_bar() +
rotate_xaxis(30) +
ggtitle("# unclassified f_Enterobacteriacea OTUs")
cowplot::plot_grid(p1, p2, p3, nrow = 1, rel_widths = c(4, 1, 1)) %>%
ggsave(paste0(dir_output, "fig_generaUnclassified.pdf"),
.,
width = 30,
height = 6)
tb_nTaxaSampleLibSize <- list(otus = l_otus,
species = l_species,
genera = l_genera) %>%
purrr::imap_dfr(function(l_phylo, feature_level) {
purrr::imap_dfr(l_phylo, function(phylo, study) {
tibble::tibble(
feature_level = feature_level,
study = study,
N_taxa = phyloseq::ntaxa(phylo),
N_samples = phyloseq::nsamples(phylo),
median_log10LibSize = phylo %>% phyloseq::sample_sums() %>%
add(1) %>% log(10) %>%
median()
)
})
})
p1 <- tb_nTaxaSampleLibSize %>%
ggplot(aes(x = N_samples,
y = N_taxa)) +
geom_point() +
ggrepel::geom_text_repel(aes(label = study)) +
facet_wrap(~factor(feature_level, levels = c("otus", "species", "genera")),
scales = "free_y")
p2 <- tb_nTaxaSampleLibSize %>%
ggplot(aes(x = median_log10LibSize,
y = N_taxa)) +
geom_point() +
ggrepel::geom_text_repel(aes(label = study)) +
facet_wrap(~factor(feature_level, levels = c("otus", "species", "genera")),
scales = "free_y")
cowplot::plot_grid(p1, p2, ncol = 1) %>%
cowplot_title("Alpha diversity across studies") %>%
ggsave(file = paste0(dir_output, "fig_alphaDiversity.pdf"),
width = 15,
heigh = 10)
list(otus = phylo_otus,
species = phylo_species,
genera = phylo_genera) %>%
purrr::iwalk(function(phylo, feature_level) {
tb_long <- phylo %>%
phyloseq::transform_sample_counts(tss) %>%
phyloseq_to_tb()
tb_presence <- tb_long %>%
dplyr::group_by(feature, dataset_name) %>%
dplyr::summarise(present = any(abundance > 0)) %>%
dplyr::group_by(feature) %>%
dplyr::mutate(n_present = sum(present),
n_absent = sum(!present))
tb_long <- tb_long %>%
dplyr::left_join(tb_presence,
by = c("feature", "dataset_name"))
tb_uniqPres <- tb_long %>%
dplyr::filter(n_present == 1, present) %>%
dplyr::group_by(feature, dataset_name) %>%
dplyr::summarise(mean_abundance = mean(abundance)) %>%
dplyr::ungroup()
tb_uniqAbs <- tb_long %>%
dplyr::filter(n_absent == 1, present) %>%
dplyr::group_by(feature) %>%
dplyr::summarise(mean_abundance = mean(abundance)) %>%
dplyr::ungroup() %>%
dplyr::left_join(
tb_presence %>%
dplyr::filter(n_absent == 1,
!present),
by = "feature"
) %>%
dplyr::select(feature, dataset_name, mean_abundance)
# Table summarises feature presence
tb_presence %>%
dplyr::filter(!duplicated(feature)) %>%
dplyr::group_by(n_present) %>%
dplyr::summarise(N = n()) %>%
t() %>%
DT::datatable(caption = feature_level) %>%
print()
# Figure uniquely present features
p <- tb_uniqPres %>%
dplyr::arrange(dataset_name, mean_abundance) %>%
dplyr::mutate(feature = factor(feature, levels = feature)) %>%
ggplot(aes(x = feature, y = mean_abundance, color = dataset_name)) + geom_point() +
rotate_xaxis(90)
ggsave(filename = paste0(dir_output, "fig_", feature_level, "UniquePresent.pdf"),
plot = p,
width = 40,
height = 10)
p <- tb_uniqPres %>%
dplyr::filter(mean_abundance > 1e-4) %>%
dplyr::arrange(dataset_name, mean_abundance) %>%
dplyr::mutate(feature = factor(feature, levels = feature)) %>%
ggplot(aes(x = feature, y = mean_abundance, color = dataset_name)) + geom_point() +
rotate_xaxis(90) + coord_flip()
p %>% print() %>%
ggsave(filename = paste0(dir_output, "fig_", feature_level, "UniquePresentZoom.pdf"),
plot = .,
width = 15,
height = 10)
# Figure uniquely absent features
p <- tb_uniqAbs %>%
dplyr::arrange(dataset_name, mean_abundance) %>%
dplyr::mutate(feature = factor(feature, levels = feature)) %>%
ggplot(aes(x = feature, y = mean_abundance, color = dataset_name)) + geom_point() +
rotate_xaxis(90) + coord_flip()
p %>% print() %>%
ggsave(filename = paste0(dir_output, "fig_", feature_level, "UniqueAbsent.pdf"),
plot = .,
width = 15,
height = 10)
})
## Individual OTUS
mat_tax_tmp <- tax_table2(l_species$Jansson_Lamendella_Crohns)
mat_tax_tmp %>%
tibble::as_tibble(rownames = "otu_cluster") %>%
dplyr::filter(Rank7 == "s__oncorhynchi")
mat_tax_tmp <- tax_table2(l_species$MucosalIBD)
mat_tax_tmp %>%
tibble::as_tibble(rownames = "otu_cluster") %>%
dplyr::filter(Rank7 == "s__blattae")
tax_table2(phylo_genera) %>%
tibble::as_tibble(rownames = "feature") %>%
dplyr::filter(Rank6 == "g__Escherichia")
# sanity check
tb_libSize <- list(otu = l_otus,
species = l_species,
genus = l_genera) %>%
purrr::imap_dfr(function(l_phylo, feature_level) {
l_phylo %>% purrr::imap_dfr(function(phylo, dataset_name) {
tibble::tibble(
sample_accession_16S = phyloseq::sample_names(phylo),
dataset_name = dataset_name,
lib_size = phyloseq::sample_sums(phylo),
feature_level = feature_level)
})
})
tb_libSize %>%
dplyr::select(dataset_name, sample_accession_16S, feature_level, lib_size) %>%
tidyr::spread(key = feature_level, value = lib_size) %>%
ggplot(aes(x = genus, y = otu)) + geom_point() + ggtitle("lib size otus vs. genera level")
tb_libSize <- tb_libSize %>% dplyr::filter(feature_level == "genus")
tb_readCounts <- studies %>%
purrr::map_dfr(~ paste0(dir_processed,
"processed/",
.x,
"/16s/all_samples_read_counts.tsv") %>%
readr::read_tsv(col_types = "cddd") %>%
dplyr::mutate(dataset_name = .x)) %>%
dplyr::rename(original_read_count = `original read count`,
reads_mapping_to_OTU_with_taxonomy = `reads mapping to OTU with taxonomy`,
reads_mapping_to_unclassifed_OTU = `reads mapping to unclassifed OTU`)
tb_libSize <- tb_libSize %>%
dplyr::left_join(tb_readCounts,
c("dataset_name" = "dataset_name",
"sample_accession_16S" = "# sample"))
tb_libSize %>%
ggplot(aes(x = lib_size, y = reads_mapping_to_OTU_with_taxonomy)) + geom_point() +
facet_wrap(~dataset_name) + ggtitle("The two should be the same")# the two are equal
tb_libSize %>%
tidyr::gather(key = "Type of read count",
value = "N",
original_read_count,
lib_size,
reads_mapping_to_OTU_with_taxonomy,
reads_mapping_to_unclassifed_OTU,
factor_key = TRUE) %>%
ggplot(aes(x = dataset_name, y = log10(N + 1), color = `Type of read count`)) +
geom_boxplot() + rotate_xaxis(90)
ggsave(filename = paste0(dir_output, "fig_libSize.pdf"),
width = 10,
height = 8)
p1 <- tb_libSize %>%
ggplot(aes(x = dataset_name, fill = (lib_size > 3000))) +
geom_bar(stat = "count") +
rotate_xaxis(30) +
ggtitle("Cutoff = 3000")
p2 <- tb_libSize %>%
ggplot(aes(x = dataset_name, fill = (lib_size > 1000))) +
geom_bar(stat = "count") +
rotate_xaxis(30) +
ggtitle("Cutoff = 1000")
cowplot::plot_grid(p1, p2, ncol = 1) %>%
cowplot_title(title = "Lib size filtering cut off?") %>% print() %>%
ggsave(filename = paste0(dir_output, "fig_libSizeCutoff.pdf"),
width = 10,
height = 8)
sample_data(phylo_species) <-
sample_data2(phylo_species) %>%
tibble::rownames_to_column() %>%
dplyr::left_join(tb_readCounts,
by = c("dataset_name" = "dataset_name",
"sample_accession_16S" = "# sample")) %>%
tibble::column_to_rownames()
sample_data(phylo_genera) <-
sample_data2(phylo_genera) %>%
tibble::rownames_to_column() %>%
dplyr::left_join(tb_readCounts,
by = c("dataset_name" = "dataset_name",
"sample_accession_16S" = "# sample")) %>%
tibble::column_to_rownames()
print(phylo_genera)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1149 taxa and 6282 samples ]
## sample_data() Sample Data: [ 6282 samples by 55 sample variables ]
## tax_table() Taxonomy Table: [ 1149 taxa by 8 taxonomic ranks ]
phylo_genera <- phylo_genera %>% phyloseq::subset_samples(!is.na(body_site) & !is.na(disease))
print(phylo_genera)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1149 taxa and 5248 samples ]
## sample_data() Sample Data: [ 5248 samples by 55 sample variables ]
## tax_table() Taxonomy Table: [ 1149 taxa by 8 taxonomic ranks ]
phylo_genera <- phylo_genera %>%
prune_taxaSamples(flist_taxa = kOverA2(k = phyloseq::nsamples(phylo_genera) * 0.01,
A = 1e-4),
flist_samples = function(x) sum(x) > 3000)
print(phylo_genera)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 541 taxa and 4872 samples ]
## sample_data() Sample Data: [ 4872 samples by 55 sample variables ]
## tax_table() Taxonomy Table: [ 541 taxa by 8 taxonomic ranks ]
print(phylo_species)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 1391 taxa and 6282 samples ]
## sample_data() Sample Data: [ 6282 samples by 55 sample variables ]
## tax_table() Taxonomy Table: [ 1391 taxa by 8 taxonomic ranks ]
phylo_species <- phylo_species %>%
phyloseq::prune_samples(phyloseq::sample_names(phylo_genera), .) %>%
phyloseq::filter_taxa(kOverA2(k = phyloseq::nsamples(phylo_genera) * 0.01,
A = 1e-4),
prune = TRUE)
print(phylo_species)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 661 taxa and 4872 samples ]
## sample_data() Sample Data: [ 4872 samples by 55 sample variables ]
## tax_table() Taxonomy Table: [ 661 taxa by 8 taxonomic ranks ]
print(phylo_otus)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 9259 taxa and 6282 samples ]
## sample_data() Sample Data: [ 6282 samples by 52 sample variables ]
## tax_table() Taxonomy Table: [ 9259 taxa by 8 taxonomic ranks ]
phylo_otus <- phylo_otus %>%
phyloseq::prune_samples(phyloseq::sample_names(phylo_genera), .) %>%
phyloseq::filter_taxa(kOverA2(k = phyloseq::nsamples(phylo_genera) * 0.01,
A = 1e-4),
prune = TRUE)
print(phylo_otus)
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 4073 taxa and 4872 samples ]
## sample_data() Sample Data: [ 4872 samples by 52 sample variables ]
## tax_table() Taxonomy Table: [ 4073 taxa by 8 taxonomic ranks ]
save(phylo_genera, file = "data/phyloseq/genera.RData")
save(phylo_species, file = "data/phyloseq/species.RData")
save(phylo_otus, file = "data/phyloseq/otus.RData")